output_results = "../results/"
output_data = "../results/"
ps_deseq <- readRDS(paste0(output_results, "Filtered/ps_deseq_friedfilt.rds"))
ps_css <- readRDS(paste0(output_results,"Filtered/ps_css_friedfilt.rds"))
ps_no_norm <- readRDS(paste0(output_results,"Filtered/ps_no_norm_friedfilt.rds"))
ps_not_norm_comp <- readRDS(paste0("../data/ps_no_norm_age_bf_filt_complete_family.rds"))
#remove samples that are too young and their paired sibling
#tooyoung <-ps_not_norm_comp@sam_data$Family.group.ID[which(ps_not_norm_comp@sam_data$Age..months. < 24)]
#family 65 should be removed according to >24 months criteria
#ps_not_norm_comp<- prune_samples(ps_not_norm_comp@sam_data$Family.group.ID != unique(tooyoung), ps_not_norm_comp)
#List of individuals that were reported w/ autism, but was not classified as such through MARA and/or video classifier
#pheno_contrad <-read.csv("../phenotype_contradictions.csv")
#contradicting<-unique(ps_not_norm_comp@sam_data$Family.group.ID[which(ps_not_norm_comp@sam_data$Host.Name %in% as.character(pheno_contrad$host_name))])
#remove the whole family
#for (i in contradicting){
# ps_not_norm_comp<- prune_samples(ps_not_norm_comp@sam_data$Family.group.ID != i, ps_not_norm_comp)
#}
#Remove Breastfed and their families
breast_fed <- c("089_A","054_N", "158_N" )
b_fed<-unique(ps_not_norm_comp@sam_data$Family.group.ID[which(ps_not_norm_comp@sam_data$Host.Name %in% breast_fed)])
#remove the whole family
for (i in b_fed){
ps_not_norm_comp<- prune_samples(ps_not_norm_comp@sam_data$Family.group.ID != i, ps_not_norm_comp)
}
#Remove Possible Contradictions and their families
possible_phen_contra <- c("020_A","131_A", "184_A" )
p_con<-unique(ps_not_norm_comp@sam_data$Family.group.ID[which(ps_not_norm_comp@sam_data$Host.Name %in% possible_phen_contra)])
#remove the whole family
for (i in p_con){
ps_not_norm_comp<- prune_samples(ps_not_norm_comp@sam_data$Family.group.ID != i, ps_not_norm_comp)
}
#For plots
ps_deseq@sam_data$Phenotype <- gsub("A", "ASD", ps_deseq@sam_data$phenotype)
ps_deseq@sam_data$Phenotype <- gsub("N", "TD", ps_deseq@sam_data$Phenotype)
ps_deseq@sam_data$Phenotype <-as.factor(ps_deseq@sam_data$Phenotype )
Compare resulting amplicon data between and within sample types by canonical correlation analysis, regression profiling, and visualization (e.g. non-metric multi-dimensional scaling [NMDS], principle coordinates of analysis, principle component analysis).
plotting_consPcoA <- function(ps, norm, variables){
#drop samples with NA values in contraints
keep <- as.vector(apply(ps@sam_data[, variables], 1, function(x) return(!any(is.na(x)))))
ps <- prune_samples(keep, ps)
#Ease of labeling
colnames(ps@sam_data)[colnames(ps@sam_data) == variables] <- gsub("frequency...longitudinal.", "", variables)
variables <- gsub("frequency...longitudinal.", "", variables)
colnames(ps@sam_data)[colnames(ps@sam_data) == variables] <- gsub("frequency.", "", variables)
variables <- gsub("frequency.", "", variables)
#include only families that have all 6 timepoints, 3 for ASD and 3 for NT participants, for ease of visualization
fam_6<-names(table(sample_data(ps)$Family.group.ID)[table(sample_data(ps)$Family.group.ID) == 6])
ps_6fam<-prune_samples(sample_data(ps)$Family.group.ID %in% fam_6,ps )
sample_data(ps_6fam)$Family.group.ID <- paste0('fam', as.character(sample_data(ps_6fam)$Family.group.ID))
formula <- as.formula(paste("~", variables))
if(length(variables) > 1){
formula <- as.formula(paste("~", paste(variables, collapse = "+")))
}
ps_pcoa_ord <- ordinate(
physeq = ps_6fam,
method = "CAP",
distance = "bray",
formula = formula
)
p <- plot_ordination(
physeq = ps_6fam,
ordination = ps_pcoa_ord,
color = variables,
axes = c(1,2),
title= paste("Constrained PcoA",norm,"ordinated by \n", paste(variables, collapse = "\n"))) +
geom_point( size = 2) +
theme_minimal()+
theme(text = element_text(size =10), plot.title = element_text(size=10), legend.title = element_blank())
if(variables == "phenotype"){
p <- p + scale_color_manual(values=sgColorPalette)
}
#sum_pcoA_DesEq<-summary(ps_pcoa_ord)
erie_bray_sum_pcoA <- phyloseq::distance(ps, method = "bray")
sampledf <- data.frame(sample_data(ps))
beta_di<-betadisper(erie_bray_sum_pcoA, ps@sam_data$Family.group.ID)
to_return<-list()
to_return[[1]]<-p
to_return[[2]]<-beta_di
return(to_return)
}
#With Deseq
DeSeq_distance <- plotting_consPcoA(ps_deseq, "Deseq", variables = "Phenotype")
# plot
DeSeq_distance[[1]]
#same with CSS
CSS_distance <- plotting_consPcoA(ps_css, "CSS", variables = "phenotype")
# plot
CSS_distance[[1]]
#With Deseq
DeSeq_distance2 <- plotting_consPcoA(ps_deseq, "Deseq", variables = "Family.group.ID")
# plot
DeSeq_distance2[[1]]
#same with CSS
CSS_distance <- plotting_consPcoA(ps_css, "CSS", variables = "Family.group.ID")
# plot
CSS_distance[[1]]
#the distance in those plot?
#average_distance_to_median
#pdf(file=paste0(output_data, "Figures/Distance_DeSeq_CSS_", Sys.Date(), ".pdf"))
boxplot(DeSeq_distance2[[2]]$distances,CSS_distance[[2]]$distances, names=c("DeSeq", "CSS"),
xlab = "Type of Normalization", ylab = "Distance on Component 1 & 2", main ="Intragroup distance for each family")
#dev.off()
permanova_res <- read.csv(paste0(output_data, "PERMANOVA_noLR.csv"))
#scores for social, learning, and anxiety
social <- c("Language.ability.and.use", "Conversation.ability", "Plays.imaginatively.when.alone", "Plays.imaginatively.with.others", "Plays.in.a.group.with.others", "Eye.contact.finding", "Picks.up.objects.to.show.to.others", "Imitation.behavior")
learning <- c("Understands.speech", "Childhood.behavioral.development.finding")
anxiety <- c("Repetitive.motion", "Sleep.pattern.finding", "Response.to.typical.sounds", "Self.injurious.behavior.finding")
permanova_res <- permanova_res[-which(permanova_res$Variable %in% c(anxiety, social, learning)), ] #First only include 95 samples because optional question
n_sample_min <- 350
variables <- as.character(permanova_res[permanova_res$TotalN > n_sample_min & permanova_res$pvalue_adj <= 0.05, ]$Variable)
plot_list <- list()
for(variable in variables){
res <- plotting_consPcoA(ps_deseq, "Deseq", variables = variable)
plot_list[[variable]] <- res[[1]]
}
plot_list[[length(plot_list) + 1]]<- DeSeq_distance[[1]]
#plot_grid(plotlist = plot_list[c(1:4)], nrow = 2, ncol = 2)
#plot_grid(plotlist = plot_list[c(5:8)], nrow = 2, ncol = 2)
#pdf(paste0(output_data, 'pcoa_top_permanova_variables.pdf'), width = 12, height = 9)
#plot_grid(plotlist = plot_list[c(5:8)], nrow = 2, ncol = 2)
pdf(paste0(output_data, 'pcoa_top_permanova_variables_vertical.pdf'), width = 8.5, height = 11, pointsize = 6)
plot_grid(plotlist = plot_list[c(1:7, length(plot_list))], ncol = 2)
dev.off()
## png
## 2
pdf(paste0(output_data, 'pcoa_all_permanova_variables_vertical.pdf'), width = 8.5, height = 11, pointsize = 6)
plot_grid(plotlist = plot_list[c(1:8)], ncol = 2)
plot_grid(plotlist = plot_list[c(9:16)], ncol = 2)
plot_grid(plotlist = plot_list[c(17:20)], ncol = 2)
#plot_grid(plotlist = plot_list[c(25:length(plot_list))], ncol = 2)
dev.off()
## png
## 2
Characterize and assess the diversity of each sample, and evaluate the extent of dissimilarity between the cohorts
ER <- estimate_richness(ps_not_norm_comp, measures=c("Observed", "Chao1", "Shannon"))
ER <- cbind(ER, sample_data(ps_not_norm_comp)[row.names(ER), c("phenotype", "Family.group.ID", "Within.study.sampling.date")])
ER <- data.table(ER, keep.rownames = TRUE)
ER$phenotype <- gsub("A", "ASD", ER$phenotype)
ER$phenotype <- gsub("N", "TD", ER$phenotype)
ER_long <- melt(ER, id.vars=c('rn', 'phenotype', "Family.group.ID", "Within.study.sampling.date"))
# plot
ggplot(data=ER_long[variable!='se.chao1'], aes(x=phenotype, y=value, fill=phenotype))+
geom_boxplot(width=0.7, outlier.colour='white')+
geom_jitter(size=1, position=position_jitter(width=0.1))+
xlab('')+ylab('')+
scale_fill_manual(values=sgColorPalette)+
theme_minimal()+facet_wrap(~variable, scales='free')
# run t-test to check significance
ttest=NULL
for(alphad in c('Observed', 'Chao1', 'Shannon')){
ASD<-ER_long[phenotype == "ASD",]
TD <-ER_long[phenotype == "TD",]
ASD <- ASD[order(ASD$Family.group.ID, ASD$Within.study.sampling.date, ASD$variable),]
TD <- TD[order(TD$Family.group.ID, TD$Within.study.sampling.date, TD$variable),]
ER_long_order <- rbind(TD, ASD)
ttest_res=t.test(value ~ phenotype, data=ER_long_order[ER_long_order$variable==alphad], var.equal=TRUE, paired = TRUE)
ttest=rbindlist(list(ttest, data.table(alpha_index=alphad, pvalue=ttest_res$p.value)))
}
pander(ttest)
| alpha_index | pvalue |
|---|---|
| Observed | 0.09121 |
| Chao1 | 0.09121 |
| Shannon | 0.5978 |
wilcox=NULL
for(alphad in c('Observed', 'Chao1', 'Shannon')){
ASD<-ER_long[phenotype == "ASD",]
TD <-ER_long[phenotype == "TD",]
ASD <- ASD[order(ASD$Family.group.ID, ASD$Within.study.sampling.date, ASD$variable),]
TD <- TD[order(TD$Family.group.ID, TD$Within.study.sampling.date, TD$variable),]
ER_long_order <- rbind(TD, ASD)
wilcox_res=wilcox.test(value ~ phenotype, data=ER_long_order[ER_long_order$variable==alphad], var.equal=TRUE, paired = TRUE)
wilcox=rbindlist(list(wilcox, data.table(alpha_index=alphad, pvalue=wilcox_res$p.value)))
}
pander(wilcox)
| alpha_index | pvalue |
|---|---|
| Observed | 0.008483 |
| Chao1 | 0.008483 |
| Shannon | 0.1188 |
#MARA
ER <- estimate_richness(ps_not_norm_comp, measures=c( "Chao1", "Shannon"))
ER <- cbind(ER, sample_data(ps_not_norm_comp)[row.names(ER), c("Mobile.Autism.Risk.Assessment.Score", "phenotype", "Family.group.ID", "Within.study.sampling.date")])
ER <- data.table(ER, keep.rownames = TRUE)
ER<-ER[ER$phenotype == "A",]
#ER_long <- melt(ER, id.vars=c('rn', "Mobile.Autism.Risk.Assessment.Score", "Family.group.ID", #"Within.study.sampling.date"))
# plot
chao<-ggplot(ER, aes(x=Mobile.Autism.Risk.Assessment.Score, y=Chao1, color=Chao1)) + geom_point()+ theme(legend.title = element_text("Chao1 Diversity Index")) + labs(color = "Chao1 Diversity Index", x = "Mobile Autism Risk Assessment Score", y = "Chao1 Diversity Index")
shan<-ggplot(ER, aes(x=Mobile.Autism.Risk.Assessment.Score, y=Shannon, color=Shannon)) + geom_point()+ theme(legend.title = element_text("Shannon Diversity Index")) + labs(color = "Shannon Diversity Index", x = "Mobile Autism Risk Assessment Score", y = "Shannon Diversity Index")
spearman.test(ER$Mobile.Autism.Risk.Assessment.Score, ER$Chao1, p=1)
## Rsquare F df1 df2 pvalue n
## 7.207538e-02 1.662218e+01 1.000000e+00 2.140000e+02 6.431700e-05 2.160000e+02
spearman.test(ER$Mobile.Autism.Risk.Assessment.Score, ER$Shannon, p=1)
## Rsquare F df1 df2 pvalue n
## 8.913118e-02 2.094053e+01 1.000000e+00 2.140000e+02 8.024563e-06 2.160000e+02
grid.arrange(shan, chao, ncol = 1)
shan<-shan+stat_cor(method="spearman",label.x = -6.75, label.y = 4.55, cor.coef.name = "R^2")
chao<-chao+stat_cor(method="spearman",label.x = -6.75, label.y = 400, cor.coef.name = "R^2")
stacked_plots <- shan + chao + plot_layout(ncol = 1)
stacked_plots
#Let's do a PcoA #not much differences
GP.ord <- ordinate(ps_deseq, "PCoA", "bray")
p2 = plot_ordination(ps_deseq, GP.ord, type="samples", color="phenotype") +
geom_point( size = 1)+
scale_color_manual(values=sgColorPalette)+
theme_minimal()
p2
# function to plot PCoA without NA points
wo_na_pcoa <- function(ps, pvar, ord_res){
# ord_res: ordinated object
keepid=!is.na(get_variable(ps, pvar)) &
get_variable(ps, pvar)!='NA' &
get_variable(ps, pvar)!=''
tmp_ps <- prune_samples(keepid, ps)
# get subset counts and metadata together
DF <- cbind(ord_res$vectors[row.names(sample_data(tmp_ps)), 1:2], sample_data(tmp_ps)[, pvar])
setnames(DF, pvar, 'testvar')
# get eigenvalues
eig=(ord_res$values$Eigenvalues/sum(ord_res$values$Eigenvalues))[1:2]*100
p <- ggplot(data=DF, aes(x=Axis.1, y=Axis.2, colour=testvar))+
geom_point(size=2)+
ggtitle(pvar)+
xlab(paste0('Axis.1 [', format(eig[1], digits=3), '%]'))+
ylab(paste0('Axis.2 [', format(eig[2], digits=3), '%]'))+
theme_minimal()+
theme(legend.title=element_blank(), legend.position="bottom")
print(p)
}
ord <- ordinate(ps_css, method = "PCoA", distance = "bray")
wo_na_pcoa(ps_css, 'Mobile.Autism.Risk.Assessment.Score', ord)
wo_na_pcoa(ps_deseq, 'Probiotic..consumption.frequency.', GP.ord)
# Anti.infective
wo_na_pcoa(ps_deseq, 'Anti.infective', GP.ord)
# Minimum.time.since.antibiotics
sample_data(ps_deseq)$Minimum.time.since.antibiotics <- as.numeric(sample_data(ps_deseq)$Minimum.time.since.antibiotics)
wo_na_pcoa(ps_deseq, 'Minimum.time.since.antibiotics', GP.ord)
permanova_pcut = 0.05
permanova_cut = 0.02
permanova_res <- read.csv(paste0(output_data, "PERMANOVA_noLR.csv"))
for(pvar in permanova_res[permanova_res$R2>permanova_cut & permanova_res$pvalue<permanova_pcut]$Variable[-which(permanova_res[permanova_res$R2>permanova_cut & permanova_res$pvalue<permanova_pcut]$Variable %in% c("High_Anxiety"))]){
wo_na_pcoa(ps_deseq, pvar, GP.ord)
}
ps_all<-readRDS("../results/Filtered/ps_allsamps_deseq_norm_friedfilt.rds")
breast_fed <- c("089_A","054_N", "158_N" )
ps_all@sam_data$breastfed <- rep("FALSE", length(rownames(ps_all@sam_data)))
ps_all@sam_data$breastfed[which(ps_all@sam_data$Host.Name %in% breast_fed)] <- rep("TRUE", length(ps_all@sam_data$breastfed[which(ps_all@sam_data$Host.Name %in% breast_fed)]))
GP.ord <- ordinate(ps_all, "PCoA", "bray")
p2 = plot_ordination(ps_all, GP.ord, type="samples", color="breastfed") +
geom_point( size = 1)+
scale_color_manual(values=sgColorPalette)+
theme_minimal()
p2
GP.ord_con <- ordinate(ps_all, "CAP", "bray", formula = ~breastfed)
p2_con = plot_ordination(ps_all, GP.ord_con, type="samples", color="breastfed") +
geom_point( size = 1)+
scale_color_manual(values=sgColorPalette)+
theme_minimal()
p2_con
saveRDS(p2, "../pcoa_diversity_plots_files/figure-html/unconstrained_pcoa_breastfed.rds")
saveRDS(p2_con, "../pcoa_diversity_plots_files/figure-html/constrained_pcoa_breastfed.rds")
ps_deseq_seafood<-subset_samples(ps_deseq, Seafood..consumption.frequency. != "NA")
GP.ord_con <- ordinate(ps_deseq_seafood, "CAP", "bray", formula = ~Seafood..consumption.frequency.)
p2_con = plot_ordination(ps_deseq_seafood, GP.ord_con, type="samples", color="Seafood..consumption.frequency.") +
ggtitle(label = "Seafood Consumption (Ordinated)")
#geom_point( size = 1)+
#scale_color_manual(values=sgColorPalette)+
#theme_minimal()
p2_con
ps_deseq_season<-subset_samples(ps_deseq, Season != "NA")
GP.ord_con <- ordinate(ps_deseq_season, "CAP", "bray", formula = ~Season)
p2_con = plot_ordination(ps_deseq_season, GP.ord_con, type="samples", color="Season") +
ggtitle(label = "Seasonality")
#geom_point( size = 1)+
#scale_color_manual(values=sgColorPalette)+
#theme_minimal()
p2_con
ps_deseq_income<-subset_samples(ps_deseq, Annual.household.income_rank != "NA")
GP.ord_con <- ordinate(ps_deseq_income, "CAP", "bray", formula = ~Annual.household.income_rank)
p2_con = plot_ordination(ps_deseq_income, GP.ord_con, type="samples", color="Annual.household.income_rank") +
ggtitle(label = "Annual.household.income_rank")
#geom_point( size = 1)+
#scale_color_manual(values=sgColorPalette)+
#theme_minimal()
p2_con
ps_deseq_income<-subset_samples(ps_deseq, Number.of.pet.reptiles != "NA")
GP.ord_con <- ordinate(ps_deseq_income, "CAP", "bray", formula = ~Number.of.pet.reptiles)
p2_con = plot_ordination(ps_deseq_income, GP.ord_con, type="samples", color="Number.of.pet.reptiles") +
ggtitle(label = "Number.of.pet.reptiles")
#geom_point( size = 1)+
#scale_color_manual(values=sgColorPalette)+
#theme_minimal()
p2_con
#Looking at NORMALIZATION
plotting_Fam_consPcoA <- function(ps,title){
fam_6<-names(table(sample_data(ps)$Family.group.ID)[table(sample_data(ps)$Family.group.ID) == 6])
ps_6fam<-prune_samples(sample_data(ps)$Family.group.ID %in% fam_6,ps )
sample_data(ps_6fam)$Family.group.ID <- paste0('fam', as.character(sample_data(ps_6fam)$Family.group.ID))
ps_pcoa_ord <- ordinate(
physeq = ps_6fam,
method = "CAP",
distance = "bray",
formula = ~ Family.group.ID
)
p<-plot_ordination(
physeq = ps_6fam,
ordination = ps_pcoa_ord,
color = "Family.group.ID",
axes = c(1,2),
title= paste("Constrained PcoA",title,"ordinated by families with all timepoints")
) +
geom_point( size = 2) +
theme_minimal()+
theme(text = element_text(size =10), plot.title = element_text(size=10), legend.position='none')
#sum_pcoA_DesEq<-summary(ps_pcoa_ord)
erie_bray_sum_pcoA <- phyloseq::distance(ps, method = "bray")
sampledf <- data.frame(sample_data(ps))
beta_di<-betadisper(erie_bray_sum_pcoA, sampledf$Family.group.ID)
to_return<-list()
to_return[[1]]<-p
to_return[[2]]<-beta_di
return(to_return)
}
#With Deseq
DeSeq_distance<-plotting_Fam_consPcoA(ps_deseq, "Deseq")
# plot
#DeSeq_distance[[1]]
#same with CSS
CSS_distance<-plotting_Fam_consPcoA(ps_css, "CSS")
# plot
#CSS_distance[[1]]
#the distance in those plot?
#average_distance_to_median
#pdf(file=paste0(output_data, "Figures/Distance_DeSeq_CSS_", Sys.Date(), ".pdf"))
boxplot(DeSeq_distance[[2]]$distances,CSS_distance[[2]]$distances, names=c("DeSeq", "CSS"),
xlab = "Type of Normalization", ylab = "Distance on Component 1 & 2", main ="Intragroup distance for each family")